Personal Computer World 2005 October

home *** CD-ROM | disk | FTP | other *** search

/ Personal Computer World 2005 October / PCWOCT05.iso / Software / FromTheMag / XAMPP 1.4.14 / xampp-win32-1.4.14-installer.exe / xampp / php / pear / docs / tidy / examples / .tmpurlgrab.php < prev

Wrap

PHP Script | 2004-03-24 | 1.5 KB | 60 lines

<?php
/*
 * urlgrab.php
 *
 * A simple command-line utility to extract all of the URLs contained
 * within <A HREF> tags from a document.
 *
 * By: John Coggeshall <john@php.net>
 *
 * Usage: php urlgrab.php <file>
 *
 * Requires PHP 7.0 or later with the tidy extension enabled. The script
 * uses the object-based tidy API (a tidy document is passed explicitly to
 * every call), which replaced the implicit-document API of tidy 1.0.
 */

/**
 * Collect the HREF value of every <A> element at or below a node.
 *
 * The tree is walked depth-first in document order using an explicit stack,
 * so the returned list has the same order a recursive pre-order walk would
 * produce, without nesting one PHP call per tree level.
 *
 * @param object|null $node Root of the subtree to scan. It must expose the
 *                          tidyNode members used here: $id, $attribute,
 *                          $child and hasChildren(). Null yields an empty
 *                          list (tidy_get_html() may return null).
 *
 * @return array List of HREF attribute values (strings), one entry per
 *               <A> element that carries an HREF attribute.
 */
function get_links($node): array
{
    $urls = [];

    if ($node === null) {
        return $urls;
    }

    $pending = [$node];

    while ($pending !== []) {
        $current = array_pop($pending);

        /*
         * Text and comment nodes have no tag id, and $attribute is null for
         * elements without attributes; isset() covers both cases before the
         * tag id is compared.
         */
        if (isset($current->id, $current->attribute['href']) && $current->id === TIDY_TAG_A) {
            $urls[] = $current->attribute['href'];
        }

        if ($current->hasChildren()) {
            /* Push in reverse so the first child is popped (visited) first. */
            foreach (array_reverse($current->child) as $child) {
                $pending[] = $child;
            }
        }
    }

    return $urls;
}

/**
 * Run the command-line utility.
 *
 * Parses the file named by the first argument, repairs the markup, and
 * prints the list of links found below the <HTML> element with print_r().
 * Diagnostics are written to standard error.
 *
 * @param array $argv Command-line arguments; $argv[1] is the document to scan.
 *
 * @return int Process exit status: 0 on success, 1 on any failure.
 */
function urlgrab_main(array $argv): int
{
    if (!isset($argv[1]) || $argv[1] === '') {
        file_put_contents('php://stderr', "Usage: php urlgrab.php <file>\n");
        return 1;
    }

    if (!extension_loaded('tidy')) {
        file_put_contents('php://stderr', "urlgrab: the tidy extension is not loaded\n");
        return 1;
    }

    /* Parse the document */
    $document = tidy_parse_file($argv[1]);
    if (!$document instanceof tidy) {
        file_put_contents('php://stderr', "urlgrab: unable to parse '{$argv[1]}'\n");
        return 1;
    }

    /* Fix up the document */
    tidy_clean_repair($document);

    /* Get an object representing everything from the <HTML> tag in,
       then traverse the document tree */
    print_r(get_links(tidy_get_html($document)));

    return 0;
}

/*
 * Only run when this file is the script being executed, so that get_links()
 * can be loaded from another script without triggering a parse.
 */
if (get_included_files()[0] === __FILE__) {
    exit(urlgrab_main($_SERVER['argv'] ?? []));
}